Import & Explore the Dataset

# Load the diabetes data, drop rows containing NA, and treat Outcome as a
# categorical response so the models below do classification.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
diabetes.df <- read.csv("diabetes.csv", header = TRUE)
# NOTE(review): the recorded summary shows a minimum of 0 for Glucose,
# BloodPressure and BMI; these look like placeholder-coded missing values
# that na.omit() does not remove -- confirm against the data dictionary.
diabetes.df <- na.omit(diabetes.df)
diabetes.df$Outcome <- factor(diabetes.df$Outcome)
summary(diabetes.df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  Outcome
##  0:500  
##  1:268  
##         
##         
##         
## 

Split the Data Into Training and Test Datasets

# Hold out a random test set: 60% of the rows train the models and the
# remaining rows are kept for evaluation.
data.size <- nrow(diabetes.df)
train.size <- 0.60

# Fixed seed so the random split is reproducible.
set.seed(12345)

# floor() makes explicit the truncation that sample() otherwise applies
# silently to a fractional size; seq_len() stays correct for zero rows.
train.row.nums <- sample(
  seq_len(data.size),
  floor(data.size * train.size),
  replace = FALSE
)
train.data <- diabetes.df[train.row.nums, ]

# Every row not drawn for training goes to the test set.
test.row.nums <- setdiff(seq_len(data.size), train.row.nums)
test.data <- diabetes.df[test.row.nums, ]

# Reference labels for the confusion matrices, selected by name rather than
# by column position so a column reorder cannot silently pick another column.
true.labels <- test.data$Outcome

Create the Logistic Regression Model

# Full logistic regression: Outcome against every other column of the
# training split.
mod1 <- glm(
  formula = Outcome ~ .,
  family = binomial(logit),
  data = train.data
)
summary(mod1)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial(logit), data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5127  -0.7455  -0.4638   0.7726   2.8965  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -7.5705262  0.8731690  -8.670  < 2e-16 ***
## Pregnancies               0.0975515  0.0423826   2.302  0.02135 *  
## Glucose                   0.0327410  0.0046987   6.968 3.21e-12 ***
## BloodPressure            -0.0099185  0.0068293  -1.452  0.14640    
## SkinThickness             0.0037903  0.0086366   0.439  0.66076    
## Insulin                  -0.0008118  0.0010887  -0.746  0.45590    
## BMI                       0.0654534  0.0183843   3.560  0.00037 ***
## DiabetesPedigreeFunction  0.9047769  0.3684175   2.456  0.01406 *  
## Age                       0.0156362  0.0118213   1.323  0.18593    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 586.51  on 459  degrees of freedom
## Residual deviance: 447.69  on 451  degrees of freedom
## AIC: 465.69
## 
## Number of Fisher Scoring iterations: 5
# Stepwise selection by AIC starting from the full model; step() prints the
# elimination trace as it runs.
mod2 <- step(mod1)
## Start:  AIC=465.69
## Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness + 
##     Insulin + BMI + DiabetesPedigreeFunction + Age
## 
##                            Df Deviance    AIC
## - SkinThickness             1   447.89 463.89
## - Insulin                   1   448.25 464.25
## - Age                       1   449.44 465.44
## <none>                          447.69 465.69
## - BloodPressure             1   449.82 465.82
## - Pregnancies               1   453.09 469.09
## - DiabetesPedigreeFunction  1   453.94 469.94
## - BMI                       1   461.49 477.49
## - Glucose                   1   508.33 524.33
## 
## Step:  AIC=463.89
## Outcome ~ Pregnancies + Glucose + BloodPressure + Insulin + BMI + 
##     DiabetesPedigreeFunction + Age
## 
##                            Df Deviance    AIC
## - Insulin                   1   448.27 462.27
## - Age                       1   449.58 463.58
## - BloodPressure             1   449.85 463.85
## <none>                          447.89 463.89
## - Pregnancies               1   453.23 467.23
## - DiabetesPedigreeFunction  1   454.38 468.38
## - BMI                       1   464.09 478.09
## - Glucose                   1   508.78 522.78
## 
## Step:  AIC=462.27
## Outcome ~ Pregnancies + Glucose + BloodPressure + BMI + DiabetesPedigreeFunction + 
##     Age
## 
##                            Df Deviance    AIC
## - Age                       1   450.20 462.20
## <none>                          448.27 462.27
## - BloodPressure             1   450.37 462.37
## - Pregnancies               1   453.83 465.83
## - DiabetesPedigreeFunction  1   454.59 466.59
## - BMI                       1   464.12 476.12
## - Glucose                   1   512.93 524.93
## 
## Step:  AIC=462.2
## Outcome ~ Pregnancies + Glucose + BloodPressure + BMI + DiabetesPedigreeFunction
## 
##                            Df Deviance    AIC
## - BloodPressure             1   451.69 461.69
## <none>                          450.20 462.20
## - DiabetesPedigreeFunction  1   456.56 466.56
## - Pregnancies               1   463.34 473.34
## - BMI                       1   464.87 474.87
## - Glucose                   1   527.81 537.81
## 
## Step:  AIC=461.69
## Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction
## 
##                            Df Deviance    AIC
## <none>                          451.69 461.69
## - DiabetesPedigreeFunction  1   458.49 466.49
## - Pregnancies               1   463.75 471.75
## - BMI                       1   465.15 473.15
## - Glucose                   1   527.86 535.86
# Coefficients and fit statistics of the model retained by step().
summary(mod2)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction, 
##     family = binomial(logit), data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7054  -0.7262  -0.4730   0.7971   2.9196  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -7.603533   0.795940  -9.553  < 2e-16 ***
## Pregnancies               0.121851   0.035429   3.439 0.000583 ***
## Glucose                   0.032570   0.004182   7.788 6.83e-15 ***
## BMI                       0.059150   0.016823   3.516 0.000438 ***
## DiabetesPedigreeFunction  0.928488   0.361494   2.568 0.010215 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 586.51  on 459  degrees of freedom
## Residual deviance: 451.69  on 455  degrees of freedom
## AIC: 461.69
## 
## Number of Fisher Scoring iterations: 4

Make Predictions From the Logistic Model & Generate Confusion Matrix

# Score the held-out rows with the reduced logistic model and turn the
# predicted probabilities into 0/1 class labels at a fixed cut-off.
fit.pred1 <- predict(mod2, newdata = test.data, type = "response")
class.threshold <- 0.5

# Levels are pinned to those of the reference labels so the factor keeps
# both classes even when every prediction falls on one side of the
# threshold; confusionMatrix() expects matching levels.
pred.labels <- factor(
  ifelse(fit.pred1 > class.threshold, "1", "0"),
  levels = levels(true.labels)
)

# NOTE(review): the recorded output reports class 0 as the 'Positive' class,
# so Sensitivity refers to Outcome 0 -- confirm that is the intended reading.
confusionMatrix(pred.labels, true.labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 179  48
##          1  15  66
##                                          
##                Accuracy : 0.7955         
##                  95% CI : (0.746, 0.8391)
##     No Information Rate : 0.6299         
##     P-Value [Acc > NIR] : 2.434e-10      
##                                          
##                   Kappa : 0.5335         
##                                          
##  Mcnemar's Test P-Value : 5.539e-05      
##                                          
##             Sensitivity : 0.9227         
##             Specificity : 0.5789         
##          Pos Pred Value : 0.7885         
##          Neg Pred Value : 0.8148         
##              Prevalence : 0.6299         
##          Detection Rate : 0.5812         
##    Detection Prevalence : 0.7370         
##       Balanced Accuracy : 0.7508         
##                                          
##        'Positive' Class : 0              
## 

Create a Decision Tree

# Fit a classification tree on the training split only, then evaluate it on
# the held-out rows. The tree used to be fit on diabetes.df, which contains
# the test rows, so its test-set accuracy was inflated by data leakage and
# was not comparable with the logistic model trained on train.data.
# NOTE(review): the rpart output recorded in this document and the
# hard-coded pruning cp further down come from the earlier full-data fit and
# should be regenerated after this change.
diabetes.tree <- rpart(Outcome ~ ., data = train.data, method = "class")
rpart.plot(diabetes.tree, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

fit.pred2 <- predict(diabetes.tree, newdata = test.data, type = "class")
confusionMatrix(fit.pred2, true.labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 173  23
##          1  21  91
##                                          
##                Accuracy : 0.8571         
##                  95% CI : (0.813, 0.8942)
##     No Information Rate : 0.6299         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.6925         
##                                          
##  Mcnemar's Test P-Value : 0.8802         
##                                          
##             Sensitivity : 0.8918         
##             Specificity : 0.7982         
##          Pos Pred Value : 0.8827         
##          Neg Pred Value : 0.8125         
##              Prevalence : 0.6299         
##          Detection Rate : 0.5617         
##    Detection Prevalence : 0.6364         
##       Balanced Accuracy : 0.8450         
##                                          
##        'Positive' Class : 0              
## 

Choose the cp For the Tree & Prune the Tree

# Cross-validate a `tree`-package fit and plot the CV deviance side by side
# against the `size` and `k` components returned by cv.tree().
# par() returns the previous settings, which are restored after the two
# plots so later figures are not drawn into half of the device.
old.par <- par(mfrow = c(1, 2))
diabetes.tree2 <- tree(Outcome ~ ., data = diabetes.df)
cv.diabetes <- cv.tree(diabetes.tree2)
plot(cv.diabetes$size, cv.diabetes$dev, type = "b")
plot(cv.diabetes$k, cv.diabetes$dev, type = "b")
par(old.par)

# Full rpart report (cp table, variable importance, per-node splits) for the
# tree that is pruned next.
summary(diabetes.tree)
## Call:
## rpart(formula = Outcome ~ ., data = diabetes.df, method = "class")
##   n= 768 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.24253731      0 1.0000000 1.0000000 0.04928752
## 2 0.10447761      1 0.7574627 0.8059701 0.04649230
## 3 0.01741294      2 0.6529851 0.7537313 0.04552694
## 4 0.01492537      5 0.6007463 0.7574627 0.04559916
## 5 0.01305970      9 0.5410448 0.7500000 0.04545421
## 6 0.01119403     12 0.4925373 0.7537313 0.04552694
## 7 0.01000000     15 0.4589552 0.7500000 0.04545421
## 
## Variable importance
##                  Glucose                      BMI                      Age 
##                       39                       17                       12 
##            BloodPressure                  Insulin              Pregnancies 
##                       10                        8                        6 
## DiabetesPedigreeFunction            SkinThickness 
##                        5                        2 
## 
## Node number 1: 768 observations,    complexity param=0.2425373
##   predicted class=0  expected loss=0.3489583  P(node) =1
##     class counts:   500   268
##    probabilities: 0.651 0.349 
##   left son=2 (485 obs) right son=3 (283 obs)
##   Primary splits:
##       Glucose     < 127.5  to the left,  improve=63.36011, (0 missing)
##       Age         < 28.5   to the left,  improve=33.99082, (0 missing)
##       BMI         < 29.85  to the left,  improve=32.92453, (0 missing)
##       Pregnancies < 6.5    to the left,  improve=19.69295, (0 missing)
##       Insulin     < 121    to the left,  improve=13.33926, (0 missing)
##   Surrogate splits:
##       Insulin                  < 121    to the left,  agree=0.697, adj=0.177, (0 split)
##       Age                      < 48.5   to the left,  agree=0.665, adj=0.092, (0 split)
##       BloodPressure            < 81     to the left,  agree=0.659, adj=0.074, (0 split)
##       BMI                      < 39.75  to the left,  agree=0.659, adj=0.074, (0 split)
##       DiabetesPedigreeFunction < 1.149  to the left,  agree=0.642, adj=0.028, (0 split)
## 
## Node number 2: 485 observations,    complexity param=0.01492537
##   predicted class=0  expected loss=0.1938144  P(node) =0.6315104
##     class counts:   391    94
##    probabilities: 0.806 0.194 
##   left son=4 (271 obs) right son=5 (214 obs)
##   Primary splits:
##       Age                      < 28.5   to the left,  improve=14.579100, (0 missing)
##       BMI                      < 26.95  to the left,  improve=10.747980, (0 missing)
##       Glucose                  < 99.5   to the left,  improve= 8.411871, (0 missing)
##       Pregnancies              < 6.5    to the left,  improve= 8.228052, (0 missing)
##       DiabetesPedigreeFunction < 0.659  to the left,  improve= 5.950521, (0 missing)
##   Surrogate splits:
##       Pregnancies   < 3.5    to the left,  agree=0.802, adj=0.551, (0 split)
##       BloodPressure < 71     to the left,  agree=0.658, adj=0.224, (0 split)
##       SkinThickness < 7.5    to the right, agree=0.627, adj=0.154, (0 split)
##       Insulin       < 7.5    to the right, agree=0.625, adj=0.150, (0 split)
##       Glucose       < 113.5  to the left,  agree=0.598, adj=0.089, (0 split)
## 
## Node number 3: 283 observations,    complexity param=0.1044776
##   predicted class=1  expected loss=0.385159  P(node) =0.3684896
##     class counts:   109   174
##    probabilities: 0.385 0.615 
##   left son=6 (76 obs) right son=7 (207 obs)
##   Primary splits:
##       BMI                      < 29.95  to the left,  improve=18.584530, (0 missing)
##       Glucose                  < 154.5  to the left,  improve=15.229510, (0 missing)
##       Age                      < 24.5   to the left,  improve= 7.413805, (0 missing)
##       DiabetesPedigreeFunction < 0.3165 to the left,  improve= 5.911141, (0 missing)
##       Pregnancies              < 7.5    to the left,  improve= 4.076871, (0 missing)
##   Surrogate splits:
##       Age                      < 21.5   to the left,  agree=0.742, adj=0.039, (0 split)
##       DiabetesPedigreeFunction < 0.1255 to the left,  agree=0.735, adj=0.013, (0 split)
## 
## Node number 4: 271 observations
##   predicted class=0  expected loss=0.08487085  P(node) =0.3528646
##     class counts:   248    23
##    probabilities: 0.915 0.085 
## 
## Node number 5: 214 observations,    complexity param=0.01492537
##   predicted class=0  expected loss=0.3317757  P(node) =0.2786458
##     class counts:   143    71
##    probabilities: 0.668 0.332 
##   left son=10 (41 obs) right son=11 (173 obs)
##   Primary splits:
##       BMI                      < 26.35  to the left,  improve=8.123435, (0 missing)
##       Glucose                  < 99.5   to the left,  improve=7.110539, (0 missing)
##       Insulin                  < 142.5  to the left,  improve=6.235009, (0 missing)
##       DiabetesPedigreeFunction < 0.625  to the left,  improve=3.627461, (0 missing)
##       Age                      < 56.5   to the right, improve=2.267415, (0 missing)
##   Surrogate splits:
##       Age < 66.5   to the right, agree=0.813, adj=0.024, (0 split)
## 
## Node number 6: 76 observations,    complexity param=0.01119403
##   predicted class=0  expected loss=0.3157895  P(node) =0.09895833
##     class counts:    52    24
##    probabilities: 0.684 0.316 
##   left son=12 (41 obs) right son=13 (35 obs)
##   Primary splits:
##       Glucose     < 145.5  to the left,  improve=5.112489, (0 missing)
##       Age         < 26.5   to the left,  improve=2.823058, (0 missing)
##       BMI         < 23.2   to the left,  improve=2.296651, (0 missing)
##       Pregnancies < 1.5    to the left,  improve=2.245614, (0 missing)
##       Insulin     < 177.5  to the right, improve=1.300847, (0 missing)
##   Surrogate splits:
##       BMI                      < 28.85  to the left,  agree=0.632, adj=0.200, (0 split)
##       Age                      < 23.5   to the left,  agree=0.632, adj=0.200, (0 split)
##       Pregnancies              < 3.5    to the left,  agree=0.592, adj=0.114, (0 split)
##       Insulin                  < 44.5   to the right, agree=0.592, adj=0.114, (0 split)
##       DiabetesPedigreeFunction < 0.2085 to the right, agree=0.592, adj=0.114, (0 split)
## 
## Node number 7: 207 observations,    complexity param=0.01741294
##   predicted class=1  expected loss=0.2753623  P(node) =0.2695312
##     class counts:    57   150
##    probabilities: 0.275 0.725 
##   left son=14 (115 obs) right son=15 (92 obs)
##   Primary splits:
##       Glucose                  < 157.5  to the left,  improve=6.956522, (0 missing)
##       DiabetesPedigreeFunction < 0.309  to the left,  improve=3.715521, (0 missing)
##       BloodPressure            < 61     to the right, improve=2.965253, (0 missing)
##       Age                      < 24.5   to the left,  improve=2.783779, (0 missing)
##       Pregnancies              < 7.5    to the left,  improve=2.159170, (0 missing)
##   Surrogate splits:
##       Insulin                  < 183.5  to the left,  agree=0.594, adj=0.087, (0 split)
##       Age                      < 46.5   to the left,  agree=0.589, adj=0.076, (0 split)
##       DiabetesPedigreeFunction < 0.744  to the left,  agree=0.585, adj=0.065, (0 split)
##       BMI                      < 30.85  to the right, agree=0.580, adj=0.054, (0 split)
##       BloodPressure            < 103    to the left,  agree=0.570, adj=0.033, (0 split)
## 
## Node number 10: 41 observations
##   predicted class=0  expected loss=0.04878049  P(node) =0.05338542
##     class counts:    39     2
##    probabilities: 0.951 0.049 
## 
## Node number 11: 173 observations,    complexity param=0.01492537
##   predicted class=0  expected loss=0.3988439  P(node) =0.2252604
##     class counts:   104    69
##    probabilities: 0.601 0.399 
##   left son=22 (55 obs) right son=23 (118 obs)
##   Primary splits:
##       Glucose                  < 99.5   to the left,  improve=7.595901, (0 missing)
##       DiabetesPedigreeFunction < 0.625  to the left,  improve=6.657917, (0 missing)
##       Insulin                  < 142.5  to the left,  improve=5.270370, (0 missing)
##       BloodPressure            < 91     to the right, improve=1.895734, (0 missing)
##       Age                      < 56.5   to the right, improve=1.572004, (0 missing)
##   Surrogate splits:
##       DiabetesPedigreeFunction < 0.098  to the left,  agree=0.688, adj=0.018, (0 split)
## 
## Node number 12: 41 observations
##   predicted class=0  expected loss=0.1463415  P(node) =0.05338542
##     class counts:    35     6
##    probabilities: 0.854 0.146 
## 
## Node number 13: 35 observations,    complexity param=0.01119403
##   predicted class=1  expected loss=0.4857143  P(node) =0.04557292
##     class counts:    17    18
##    probabilities: 0.486 0.514 
##   left son=26 (21 obs) right son=27 (14 obs)
##   Primary splits:
##       Insulin       < 14.5   to the left,  improve=1.8666670, (0 missing)
##       Age           < 28.5   to the left,  improve=1.4486770, (0 missing)
##       BloodPressure < 74.5   to the right, improve=1.1958590, (0 missing)
##       BMI           < 25.55  to the right, improve=0.9657143, (0 missing)
##       Pregnancies   < 1.5    to the left,  improve=0.9142857, (0 missing)
##   Surrogate splits:
##       SkinThickness            < 7      to the left,  agree=0.943, adj=0.857, (0 split)
##       DiabetesPedigreeFunction < 0.315  to the left,  agree=0.686, adj=0.214, (0 split)
##       Age                      < 27.5   to the right, agree=0.657, adj=0.143, (0 split)
##       Pregnancies              < 1.5    to the right, agree=0.629, adj=0.071, (0 split)
##       BloodPressure            < 74.5   to the right, agree=0.629, adj=0.071, (0 split)
## 
## Node number 14: 115 observations,    complexity param=0.01741294
##   predicted class=1  expected loss=0.3913043  P(node) =0.1497396
##     class counts:    45    70
##    probabilities: 0.391 0.609 
##   left son=28 (50 obs) right son=29 (65 obs)
##   Primary splits:
##       Age                      < 30.5   to the left,  improve=3.911839, (0 missing)
##       BloodPressure            < 61     to the right, improve=3.635942, (0 missing)
##       DiabetesPedigreeFunction < 0.421  to the left,  improve=3.496289, (0 missing)
##       BMI                      < 41.65  to the left,  improve=2.717391, (0 missing)
##       Pregnancies              < 7.5    to the left,  improve=2.660742, (0 missing)
##   Surrogate splits:
##       Pregnancies              < 4.5    to the left,  agree=0.800, adj=0.54, (0 split)
##       BloodPressure            < 71     to the left,  agree=0.670, adj=0.24, (0 split)
##       Insulin                  < 186    to the right, agree=0.617, adj=0.12, (0 split)
##       BMI                      < 31.25  to the left,  agree=0.591, adj=0.06, (0 split)
##       DiabetesPedigreeFunction < 0.436  to the left,  agree=0.591, adj=0.06, (0 split)
## 
## Node number 15: 92 observations
##   predicted class=1  expected loss=0.1304348  P(node) =0.1197917
##     class counts:    12    80
##    probabilities: 0.130 0.870 
## 
## Node number 22: 55 observations
##   predicted class=0  expected loss=0.1818182  P(node) =0.07161458
##     class counts:    45    10
##    probabilities: 0.818 0.182 
## 
## Node number 23: 118 observations,    complexity param=0.01492537
##   predicted class=0  expected loss=0.5  P(node) =0.1536458
##     class counts:    59    59
##    probabilities: 0.500 0.500 
##   left son=46 (84 obs) right son=47 (34 obs)
##   Primary splits:
##       DiabetesPedigreeFunction < 0.561  to the left,  improve=5.288515, (0 missing)
##       BloodPressure            < 85     to the right, improve=2.615248, (0 missing)
##       Insulin                  < 142.5  to the left,  improve=2.187185, (0 missing)
##       Age                      < 57     to the right, improve=1.898327, (0 missing)
##       BMI                      < 34.65  to the right, improve=1.812039, (0 missing)
##   Surrogate splits:
##       Insulin       < 190.5  to the left,  agree=0.746, adj=0.118, (0 split)
##       BMI           < 43.35  to the left,  agree=0.729, adj=0.059, (0 split)
##       SkinThickness < 44.5   to the left,  agree=0.720, adj=0.029, (0 split)
## 
## Node number 26: 21 observations
##   predicted class=0  expected loss=0.3809524  P(node) =0.02734375
##     class counts:    13     8
##    probabilities: 0.619 0.381 
## 
## Node number 27: 14 observations
##   predicted class=1  expected loss=0.2857143  P(node) =0.01822917
##     class counts:     4    10
##    probabilities: 0.286 0.714 
## 
## Node number 28: 50 observations,    complexity param=0.01741294
##   predicted class=0  expected loss=0.46  P(node) =0.06510417
##     class counts:    27    23
##    probabilities: 0.540 0.460 
##   left son=56 (40 obs) right son=57 (10 obs)
##   Primary splits:
##       BloodPressure < 61     to the right, improve=7.290000, (0 missing)
##       Insulin       < 199    to the right, improve=2.347937, (0 missing)
##       BMI           < 41.8   to the left,  improve=2.014825, (0 missing)
##       SkinThickness < 19.5   to the right, improve=1.300317, (0 missing)
##       Pregnancies   < 0.5    to the right, improve=1.284444, (0 missing)
## 
## Node number 29: 65 observations
##   predicted class=1  expected loss=0.2769231  P(node) =0.08463542
##     class counts:    18    47
##    probabilities: 0.277 0.723 
## 
## Node number 46: 84 observations,    complexity param=0.0130597
##   predicted class=0  expected loss=0.4047619  P(node) =0.109375
##     class counts:    50    34
##    probabilities: 0.595 0.405 
##   left son=92 (21 obs) right son=93 (63 obs)
##   Primary splits:
##       DiabetesPedigreeFunction < 0.2    to the left,  improve=2.571429, (0 missing)
##       Age                      < 54.5   to the right, improve=2.502165, (0 missing)
##       BloodPressure            < 85     to the right, improve=2.221581, (0 missing)
##       BMI                      < 34.65  to the right, improve=2.114657, (0 missing)
##       SkinThickness            < 28     to the right, improve=1.895085, (0 missing)
##   Surrogate splits:
##       BloodPressure < 51     to the left,  agree=0.786, adj=0.143, (0 split)
##       SkinThickness < 47.5   to the right, agree=0.762, adj=0.048, (0 split)
## 
## Node number 47: 34 observations
##   predicted class=1  expected loss=0.2647059  P(node) =0.04427083
##     class counts:     9    25
##    probabilities: 0.265 0.735 
## 
## Node number 56: 40 observations,    complexity param=0.01119403
##   predicted class=0  expected loss=0.325  P(node) =0.05208333
##     class counts:    27    13
##    probabilities: 0.675 0.325 
##   left son=112 (31 obs) right son=113 (9 obs)
##   Primary splits:
##       BMI                      < 41.8   to the left,  improve=2.711290, (0 missing)
##       Insulin                  < 260    to the right, improve=2.453226, (0 missing)
##       DiabetesPedigreeFunction < 0.311  to the left,  improve=2.002381, (0 missing)
##       BloodPressure            < 73     to the right, improve=1.633333, (0 missing)
##       Pregnancies              < 0.5    to the right, improve=1.319231, (0 missing)
##   Surrogate splits:
##       BloodPressure < 84.5   to the left,  agree=0.825, adj=0.222, (0 split)
##       SkinThickness < 40.5   to the left,  agree=0.800, adj=0.111, (0 split)
## 
## Node number 57: 10 observations
##   predicted class=1  expected loss=0  P(node) =0.01302083
##     class counts:     0    10
##    probabilities: 0.000 1.000 
## 
## Node number 92: 21 observations
##   predicted class=0  expected loss=0.1904762  P(node) =0.02734375
##     class counts:    17     4
##    probabilities: 0.810 0.190 
## 
## Node number 93: 63 observations,    complexity param=0.0130597
##   predicted class=0  expected loss=0.4761905  P(node) =0.08203125
##     class counts:    33    30
##    probabilities: 0.524 0.476 
##   left son=186 (52 obs) right son=187 (11 obs)
##   Primary splits:
##       Pregnancies              < 1.5    to the right, improve=3.117383, (0 missing)
##       BloodPressure            < 67     to the right, improve=2.603571, (0 missing)
##       Age                      < 48     to the right, improve=2.309690, (0 missing)
##       SkinThickness            < 26.5   to the right, improve=1.928571, (0 missing)
##       DiabetesPedigreeFunction < 0.4255 to the right, improve=1.728571, (0 missing)
##   Surrogate splits:
##       SkinThickness < 45.5   to the left,  agree=0.857, adj=0.182, (0 split)
##       Insulin       < 193    to the left,  agree=0.857, adj=0.182, (0 split)
##       BMI           < 44.55  to the left,  agree=0.857, adj=0.182, (0 split)
## 
## Node number 112: 31 observations
##   predicted class=0  expected loss=0.2258065  P(node) =0.04036458
##     class counts:    24     7
##    probabilities: 0.774 0.226 
## 
## Node number 113: 9 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.01171875
##     class counts:     3     6
##    probabilities: 0.333 0.667 
## 
## Node number 186: 52 observations,    complexity param=0.0130597
##   predicted class=0  expected loss=0.4038462  P(node) =0.06770833
##     class counts:    31    21
##    probabilities: 0.596 0.404 
##   left son=372 (40 obs) right son=373 (12 obs)
##   Primary splits:
##       BloodPressure < 67     to the right, improve=3.738462, (0 missing)
##       Insulin       < 11     to the right, improve=2.611571, (0 missing)
##       SkinThickness < 26.5   to the right, improve=2.377855, (0 missing)
##       BMI           < 34.05  to the right, improve=2.377855, (0 missing)
##       Age           < 47.5   to the right, improve=2.286081, (0 missing)
## 
## Node number 187: 11 observations
##   predicted class=1  expected loss=0.1818182  P(node) =0.01432292
##     class counts:     2     9
##    probabilities: 0.182 0.818 
## 
## Node number 372: 40 observations
##   predicted class=0  expected loss=0.3  P(node) =0.05208333
##     class counts:    28    12
##    probabilities: 0.700 0.300 
## 
## Node number 373: 12 observations
##   predicted class=1  expected loss=0.25  P(node) =0.015625
##     class counts:     3     9
##    probabilities: 0.250 0.750
# Cross-validated error against cp for the unpruned tree.
plotcp(diabetes.tree)
# Cut the tree back at a fixed complexity parameter. In the cp table
# recorded above, 0.043 lies between the rows with CP 0.104 and 0.017.
diabetes.prune <- prune(tree = diabetes.tree, cp = 0.043)

Plot the Pruned Tree

# Draw the pruned tree with the same palette, shadow and node numbering as
# the plot of the unpruned tree.
rpart.plot(
  diabetes.prune,
  box.palette = "RdBu",
  shadow.col = "gray",
  nn = TRUE
)

Generate the Correlation Matrix Using corrplot & plotly

# Pairwise correlations and their p-values for all columns.
# data.matrix() converts the factor Outcome to its integer codes and keeps
# the matrix numeric; as.matrix() on a data frame containing a factor
# produces a character matrix instead. This also matches the
# data.matrix() call used for the plotly heatmap.
corr <- rcorr(data.matrix(diabetes.df))
corr
##                          Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies                     1.00    0.13          0.14         -0.08
## Glucose                         0.13    1.00          0.15          0.06
## BloodPressure                   0.14    0.15          1.00          0.21
## SkinThickness                  -0.08    0.06          0.21          1.00
## Insulin                        -0.07    0.33          0.09          0.44
## BMI                             0.02    0.22          0.28          0.39
## DiabetesPedigreeFunction       -0.03    0.14          0.04          0.18
## Age                             0.54    0.26          0.24         -0.11
## Outcome                         0.22    0.47          0.07          0.07
##                          Insulin  BMI DiabetesPedigreeFunction   Age
## Pregnancies                -0.07 0.02                    -0.03  0.54
## Glucose                     0.33 0.22                     0.14  0.26
## BloodPressure               0.09 0.28                     0.04  0.24
## SkinThickness               0.44 0.39                     0.18 -0.11
## Insulin                     1.00 0.20                     0.19 -0.04
## BMI                         0.20 1.00                     0.14  0.04
## DiabetesPedigreeFunction    0.19 0.14                     1.00  0.03
## Age                        -0.04 0.04                     0.03  1.00
## Outcome                     0.13 0.29                     0.17  0.24
##                          Outcome
## Pregnancies                 0.22
## Glucose                     0.47
## BloodPressure               0.07
## SkinThickness               0.07
## Insulin                     0.13
## BMI                         0.29
## DiabetesPedigreeFunction    0.17
## Age                         0.24
## Outcome                     1.00
## 
## n= 768 
## 
## 
## P
##                          Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies                          0.0003  0.0000        0.0236       
## Glucose                  0.0003              0.0000        0.1124       
## BloodPressure            0.0000      0.0000                0.0000       
## SkinThickness            0.0236      0.1124  0.0000                     
## Insulin                  0.0416      0.0000  0.0137        0.0000       
## BMI                      0.6246      0.0000  0.0000        0.0000       
## DiabetesPedigreeFunction 0.3535      0.0001  0.2534        0.0000       
## Age                      0.0000      0.0000  0.0000        0.0016       
## Outcome                  0.0000      0.0000  0.0715        0.0383       
##                          Insulin BMI    DiabetesPedigreeFunction Age   
## Pregnancies              0.0416  0.6246 0.3535                   0.0000
## Glucose                  0.0000  0.0000 0.0001                   0.0000
## BloodPressure            0.0137  0.0000 0.2534                   0.0000
## SkinThickness            0.0000  0.0000 0.0000                   0.0016
## Insulin                          0.0000 0.0000                   0.2432
## BMI                      0.0000         0.0000                   0.3158
## DiabetesPedigreeFunction 0.0000  0.0000                          0.3530
## Age                      0.2432  0.3158 0.3530                         
## Outcome                  0.0003  0.0000 0.0000                   0.0000
##                          Outcome
## Pregnancies              0.0000 
## Glucose                  0.0000 
## BloodPressure            0.0715 
## SkinThickness            0.0383 
## Insulin                  0.0003 
## BMI                      0.0000 
## DiabetesPedigreeFunction 0.0000 
## Age                      0.0000 
## Outcome
# Font settings for the plotly figure (white text for the black background).
# NOTE(review): `t` shadows the name of base::t(); the name is kept in case
# code outside this section reads it -- consider renaming.
t <- list(
  family = "Arial",
  size = 13,
  color = "white"
)

# Static correlogram: upper triangle, variables ordered by hierarchical
# clustering.
corrplot(corr$r, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

# Interactive heatmap of the same correlations. `paper_bgcolor` is set only
# in layout(): passing it to plot_ly() as a heatmap trace attribute triggered
# the "'heatmap' objects don't have these attributes" warning.
p <- plot_ly(
  z = cor(data.matrix(diabetes.df)),
  x = colnames(diabetes.df),
  y = colnames(diabetes.df),
  type = "heatmap",
  colorscale = "Electric"
) %>%
  layout(paper_bgcolor = "black", title = "Correlation Matrix", font = t)
p
## Warning: 'heatmap' objects don't have these attributes: 'paper_bgcolor'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

Generate Confusion Matrix for the Decision Tree

# Score the held-out rows with diabetes.prune, asking for predicted class
# labels (type = "class").
fit.pred3 <- predict(
  object = diabetes.prune,
  newdata = test.data,
  type = "class"
)

# Cross-tabulate the predictions against the true test labels.
# NOTE(review): the printed result reports class "0" as the positive class, so
# Sensitivity describes non-diabetic cases; pass positive = "1" if the diabetic
# class is the one of interest -- confirm intent.
confusionMatrix(data = fit.pred3, reference = true.labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 175  45
##          1  19  69
##                                           
##                Accuracy : 0.7922          
##                  95% CI : (0.7426, 0.8361)
##     No Information Rate : 0.6299          
##     P-Value [Acc > NIR] : 5.558e-10       
##                                           
##                   Kappa : 0.5324          
##                                           
##  Mcnemar's Test P-Value : 0.001778        
##                                           
##             Sensitivity : 0.9021          
##             Specificity : 0.6053          
##          Pos Pred Value : 0.7955          
##          Neg Pred Value : 0.7841          
##              Prevalence : 0.6299          
##          Detection Rate : 0.5682          
##    Detection Prevalence : 0.7143          
##       Balanced Accuracy : 0.7537          
##                                           
##        'Positive' Class : 0               
## 

Create the Random Forest Model

# Fit a random forest on the training rows and predict the held-out rows.
seed.val <- 12345
# seed.val used to be assigned without ever being applied. Seeding here makes
# the fit reproducible regardless of how much of the random-number stream the
# earlier chunks consumed. The printed results below were produced without
# this call and may differ slightly until the document is re-knit.
set.seed(seed.val)

rf.diabetes <- randomForest(
  Outcome ~ .,
  data = train.data,
  # mtry equals the number of predictors (8), so every split considers all of
  # them rather than a random subset.
  mtry = 8,
  # Request the importance measures that importance() and varImpPlot() report.
  importance = TRUE
)

# Predicted class for each row of the test set.
rf.diabetes.pred <- predict(rf.diabetes, newdata = test.data)

Create the Importance Plot & Confusion Matrix

# Print the variable-importance table of the fitted forest (per-class columns,
# MeanDecreaseAccuracy and MeanDecreaseGini).
importance(x = rf.diabetes)
##                                  0          1 MeanDecreaseAccuracy
## Pregnancies              10.467128  0.5704655            9.4726353
## Glucose                  30.957097 24.8050006           38.3498820
## BloodPressure             6.217104 -2.9484487            3.4257947
## SkinThickness             3.326062 -4.7388849           -0.2483231
## Insulin                  11.750394 -3.5926739            8.4321889
## BMI                      10.936904  8.8874452           13.6757860
## DiabetesPedigreeFunction  7.205492  5.8181582            9.0672109
## Age                      10.785277  2.4557878           10.5619575
##                          MeanDecreaseGini
## Pregnancies                      13.36084
## Glucose                          66.26529
## BloodPressure                    18.65839
## SkinThickness                    10.88403
## Insulin                          11.32194
## BMI                              32.09698
## DiabetesPedigreeFunction         28.08399
## Age                              24.08951
# Plot the importance measures of the fitted forest.
varImpPlot(x = rf.diabetes)

# Cross-tabulate the forest's test-set predictions against the true labels.
# NOTE(review): the printed result reports class "0" as the positive class, so
# Sensitivity describes non-diabetic cases; pass positive = "1" if the diabetic
# class is the one of interest -- confirm intent.
confusionMatrix(data = rf.diabetes.pred, reference = true.labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 168  40
##          1  26  74
##                                           
##                Accuracy : 0.7857          
##                  95% CI : (0.7356, 0.8302)
##     No Information Rate : 0.6299          
##     P-Value [Acc > NIR] : 2.736e-09       
##                                           
##                   Kappa : 0.5285          
##                                           
##  Mcnemar's Test P-Value : 0.1096          
##                                           
##             Sensitivity : 0.8660          
##             Specificity : 0.6491          
##          Pos Pred Value : 0.8077          
##          Neg Pred Value : 0.7400          
##              Prevalence : 0.6299          
##          Detection Rate : 0.5455          
##    Detection Prevalence : 0.6753          
##       Balanced Accuracy : 0.7576          
##                                           
##        'Positive' Class : 0               
##